#####
# Leader of the Pack?
# Changes in ‘Wolf Warrior Diplomacy’ After 
# A Politburo Collective Study Session

# Authors:
# Samuel Brazys
# Alexander Dukalskis
# Stefan Müller


# 01_run_lss.R loads all raw Twitter data, filters relevant tweets,
# runs LSS scaling, extracts most negative/positive sentences and 
# words and stores a tweet level dataset used in the subsequent
# empirical analysis
# Note: due to Twitter's data sharing policies, the raw tweets 
# cannot be shared publicly. Researchers can retrieve those tweets that
# have not been deleted since we collected the data by using the rtweet
# package and the tweet ID variable
####


# load packages
library(tidyverse) # CRAN v1.3.1
library(LSX)       # CRAN v1.1.1
library(quanteda)  # CRAN v3.2.1
library(cowplot)   # CRAN v1.1.1
library(purrr)     # CRAN v0.3.4
library(lubridate) # CRAN v1.7.10
library(xtable)    # CRAN v1.8-4
library(here)      # CRAN v1.0.1

# set working directory (files where original Twitter data are stored)
# NOTE(review): this is an absolute, machine-specific path. All relative file
# names used below (until the next setwd() call) are resolved against it, so
# it has to be adapted before the script can run on another machine.
setwd("/Users/smueller/Dropbox/papers/twitter_china/data_china/")

# load custom ggplot2 scheme

# Custom ggplot2 theme: theme_minimal() with all grid lines removed, a thin
# black panel border, white panel and plot backgrounds, the legend at the
# bottom, and enlarged title, strip, axis and legend text.
# Takes no arguments and returns the resulting theme object.
theme_baser <- function() {
    no_grid <- element_blank()
    bg_colour <- "white"
    all_sides_5 <- margin(t = 5, r = 5, b = 5, l = 5)

    overrides <- theme(
        # no grid lines at all
        panel.grid.minor.x = no_grid,
        panel.grid.minor.y = no_grid,
        panel.grid.major.x = no_grid,
        panel.grid.major.y = no_grid,
        # thin solid frame around each panel
        panel.border = element_rect(
            fill = NA, colour = "black", size = 0.5, linetype = "solid"
        ),
        legend.title = element_text(size = 15),
        plot.title = element_text(
            size = 15, face = "bold", vjust = 1.5, hjust = 0.5,
            margin = all_sides_5
        ),
        plot.caption = element_text(colour = "grey30", size = 11, hjust = 1),
        legend.position = "bottom",
        axis.ticks.y = element_line(size = 0.3),
        axis.ticks.x = element_line(size = 0.3),
        axis.ticks.length = unit(0.2, "cm"),
        legend.text = element_text(size = 13),
        # white (not transparent) panel and plot backgrounds
        panel.background = element_rect(fill = bg_colour),
        plot.background = element_rect(fill = bg_colour, color = bg_colour),
        # facet labels
        strip.text = element_text(
            size = 15, hjust = 0.5, face = "bold",
            margin = all_sides_5
        ),
        axis.text.y = element_text(colour = "black", size = 13, hjust = 1),
        axis.text.x = element_text(colour = "black", size = 13),
        axis.title = element_text(size = 13, hjust = 0.5)
    )

    # %+replace% swaps the listed elements into theme_minimal() wholesale
    theme_minimal() %+replace% overrides
}


# set theme: make theme_baser() the default ggplot2 theme for this session
theme_set(theme_baser())



# load all rds files containing tweets and bind to one data frame
# these data cannot be shared due to Twitter's T&Cs
#
# list.files() treats `pattern` as a regular expression. The dot is therefore
# escaped and the match anchored at the end of the file name: the bare
# pattern ".rds" would also match any name that merely contains "rds" after
# some character (e.g. "words.csv"), and readRDS() would then fail on it.
dat_merged <- list.files(pattern = "\\.rds$") %>%
    map(readRDS) %>%
    bind_rows()

# timestamp of the most recent tweet in the merged data
max(dat_merged$created_at, na.rm = TRUE)

# number of rows across all scrapes (a tweet can still appear several times)
nrow(dat_merged)

# read the scrape stored in the file dated 2021-06-02; it serves as the
# reference for each account's follower count on the first scrape date
dat_first <- readRDS("data_twitter_china_2021-06-02_22-20-00.rds")

# distinct combinations of account name and follower count in that scrape
# NOTE(review): an account whose follower count differs between rows of this
# file would appear more than once here - confirm there is one row per account
dat_first_accounts <- distinct(dat_first, screen_name, followers_count)


# now select only the relevant variables and reduce the data to one row per
# tweet (a tweet is identified by screen_name, created_at and status_id).
# Rows are sorted by favorite_count and then retweet_count within a tweet,
# and only the last row of each tweet is kept, i.e. the copy with the
# highest favourite count (ties broken by the retweet count).
# NOTE(review): the data are not ordered by an explicit scrape date; the
# counts presumably act as a proxy for the most recent scrape - confirm
dat_select <- dat_merged %>% 
    select(is_retweet,
           reply_to_status_id,
           retweet_count, favorite_count,
           status_id, created_at, screen_name, text, lang) %>% 
    arrange(screen_name, created_at, favorite_count, retweet_count) %>% 
    group_by(screen_name, created_at, status_id) %>% 
    filter(row_number()==n()) %>% # keep each tweet only once: the last row of its group after sorting
    ungroup()

# attach each account's follower count from the 2 June scrape to its tweets
dat_select <- dat_select %>%
    left_join(dat_first_accounts, by = "screen_name")

# number of distinct accounts in the de-duplicated data
n_distinct(dat_select$screen_name)

# keep only accounts whose earliest tweet in the data predates
# 1 January 2020, and of those accounts only the tweets posted from
# 1 January 2020 onwards (the result stays grouped by screen_name)
dat_select_compare <- dat_select %>%
    group_by(screen_name) %>%
    mutate(first_tweet = min(created_at)) %>%
    filter(
        first_tweet < "2020-01-01",
        created_at >= "2020-01-01"
    )

# number of accounts that remain
n_distinct(dat_select_compare$screen_name)

# time span covered by the remaining tweets
min(dat_select_compare$created_at)
max(dat_select_compare$created_at)

# number of tweets per account and calendar month
dat_select_date <- dat_select_compare %>%
    mutate(
        date = lubridate::floor_date(created_at, unit = "day"),
        month_date = lubridate::floor_date(created_at, unit = "month"),
        month = lubridate::month(date),
        year = lubridate::year(date)
    ) %>%
    group_by(screen_name, month, year, month_date) %>%
    count()

# load metadata on diplomats and newspapers
# from https://demtech.oii.ox.ac.uk/wp-content/uploads/sites/127/2021/05/Chinas-Public-Diplomacy-Operations-Dem.Tech-Working-Paper-2021.1-4.pdf

# diplomat accounts: keep and rename the relevant columns, then label the
# account type in a leading `type` column
dat_meta_diplomats <- read.csv("chinese_diplomats.csv") %>%
    select(
        type_diplomat = Type,
        Handle,
        account_created = Created,
        country = Country
    ) %>%
    mutate(type = "Diplomat Accounts", .before = type_diplomat)

# state-backed media accounts: same layout, without diplomat type and country
dat_meta_media <- read.csv("state_backed_accounts.csv") %>%
    select(Handle, account_created = Created) %>%
    mutate(type = "State-Backed Media Accounts", .before = Handle)

# stack both sets of accounts (media first)
dat_meta <- bind_rows(dat_meta_media, dat_meta_diplomats)

# derive the merge key from the handle: drop asterisks and "@" and
# convert to lower case
dat_meta$screen_name <- dat_meta$Handle %>%
    str_remove_all("\\*") %>%
    str_remove_all("@") %>%
    str_to_lower()

head(dat_meta$screen_name)

# lower-case the screen names in the Twitter data so that they match the
# harmonised names in the metadata
dat_select <- dat_select %>%
    mutate(screen_name = str_to_lower(screen_name))

# accounts listed in the metadata without any tweets in the data
# (the authors note that two accounts are missing)
setdiff(dat_meta$screen_name, dat_select$screen_name)

# add the account metadata to every tweet
dat_full <- dat_select %>%
    left_join(dat_meta, by = "screen_name")

# number of accounts per account type with tweets after the cut-off
# NOTE(review): the string is compared against a date-time, so tweets posted
# during 31 December 2020 presumably also pass this filter - confirm
dat_full %>%
    filter(created_at > "2020-12-31") %>%
    distinct(screen_name, type) %>%
    group_by(type) %>%
    count()


# descriptive information on the corpus: number of tweets before and after
# the metadata join, and the time span covered
nrow(dat_full)
nrow(dat_select)
min(dat_full$created_at)
max(dat_full$created_at)

# number of tweets per account type
table(dat_full$type)


# drop retweets and replies; the row counts before and after are printed
# for comparison
nrow(dat_full)

dat_full_subset <- dat_full %>%
    ungroup() %>%
    filter(
        is_retweet == FALSE,
        is.na(reply_to_status_id) # replies carry the ID of the tweet answered
    )

nrow(dat_full_subset)

# only select tweets since 1 Jan 2021
#
# The cut-off is built as a date-time (POSIXct), not as a Date. Assuming
# created_at is POSIXct (it is converted with as.Date() further below),
# comparing it with a Date object makes R warn about incompatible methods
# and compare the raw numbers instead (seconds vs. days since 1970), so
# that virtually every tweet would pass the filter.
# UTC is used because as.Date() on a date-time also defaults to UTC, which
# keeps this cut-off consistent with the `date` variable derived later.
dat_2021 <- dat_full_subset %>%
    filter(created_at >= as.POSIXct("2021-01-01", tz = "UTC"))

nrow(dat_2021)

# sanity check: no retweets should be left after the previous step
table(dat_2021$is_retweet)

# sanity check: the earliest remaining tweet must not predate 1 Jan 2021
min(dat_2021$created_at)

# apply LSS

# open TODO from the original script: non-English tweets would have to be
# translated before they could be included here

# flag English-language tweets (1 = English, 0 = other language)
dat_2021 <- mutate(
    dat_2021,
    english_dummy = ifelse(lang == "en", 1, 0)
)

# share of non-English and English tweets
prop.table(table(dat_2021$english_dummy))

# keep English tweets only
dat_2021 <- dat_2021 %>%
    filter(english_dummy == 1)

# tokenise the tweets: strip hashtag symbols, drop punctuation, symbols,
# numbers and URLs, unify references to the United States, and remove
# English stopwords
toks_2021 <- dat_2021 %>%
    mutate(text = str_remove_all(text, "#")) %>% # keep the hashtag word itself
    corpus() %>%
    tokens(
        remove_punct = TRUE,
        remove_symbols = TRUE,
        remove_numbers = TRUE,
        remove_url = TRUE
    ) %>%
    # two-word phrase "United States" -> "USA" (case-sensitive)
    tokens_replace(
        pattern = phrase("United States"),
        replacement = "USA",
        case_insensitive = FALSE
    ) %>%
    # upper-case "US" -> "USA"; case-sensitive, so the pronoun "us" is kept
    tokens_replace(
        pattern = "US",
        replacement = "USA",
        case_insensitive = FALSE
    ) %>%
    tokens_remove(stopwords("en", source = "marimo"))


# document-feature matrix: drop the empty feature and keep only terms that
# meet the minimum term frequency and document frequency of 5
dfmat_2021 <- toks_2021 %>%
    dfm() %>%
    dfm_remove(pattern = "") %>%
    dfm_trim(min_termfreq = 5, min_docfreq = 5)


# change working directory to folder for replication materials
# NOTE(review): absolute, machine-specific path. The seed word file is read
# from this folder and all output files below are written to it, so it has
# to be adapted before the script can run on another machine.
setwd("/Users/smueller/Google Drive/papers/China_Twitter/replication_materials")

# read the seed words; the file has one column of friendly and one column
# of wolfy terms
dat_seed <- read.csv("data_seedwords.csv", fileEncoding = "utf-8")

# friendly (positive) seed words: drop missing entries and strip a
# trailing " ??" marker from the terms
dat_pos <- dat_seed %>%
    filter(!is.na(friendly)) %>%
    mutate(friendly = str_replace_all(friendly, " \\?\\?", "")) %>%
    select(friendly)


# wolfy (negative) seed words, cleaned in the same way
dat_neg <- dat_seed %>%
    filter(!is.na(wolfy)) %>%
    mutate(wolfy = str_replace_all(wolfy, " \\?\\?", "")) %>%
    select(wolfy)


# dictionary with a positive and a negative key
data_dictionary_wolfy <- dictionary(
    list(
        pos = dat_pos$friendly,
        neg = dat_neg$wolfy
    )
)

data_dictionary_wolfy

# convert the dictionary into the seed word format used by the LSS model
seed <- as.seedwords(data_dictionary_wolfy)

seed

# run LSS model
# we are using package version ‘1.1.1’
# The model is fitted on the tweet-level document-feature matrix with the
# seed words created above. k = 300 and cache = TRUE are passed on to
# LSX::textmodel_lss(); see the package documentation for their exact
# meaning (k presumably sets the number of latent dimensions - confirm).
tmod_lss <- textmodel_lss(dfmat_2021,
                          seeds = seed,
                          k = 300, 
                          cache = TRUE)


# NOTE(review): the object names below contradict their own comments
# (neg_100 is described as the most positive words, pos_100 as the most
# negative ones). Which end of coef() holds which polarity depends on how
# LSX sorts the coefficients - confirm. Neither object is used again in
# this script.
neg_100 <- head(coef(tmod_lss), 100) # most positive words according to LSS

pos_100 <- tail(coef(tmod_lss), 100) # most negative words according to LSS

# get scores for all words: one row per term, with the term names of the
# coefficient vector as row names
dat_neg_pos <- data.frame(
    coef = coef(tmod_lss)
)

# get term variable by accessing row names
# The term column is created before any further processing, and the rows are
# deliberately NOT passed through unique(): while the data frame only holds
# the score column, unique() would silently drop every term whose score is
# identical to that of an earlier term.
dat_neg_pos$term <- rownames(dat_neg_pos)

# sort from the highest to the lowest score
dat_neg_pos <- arrange(dat_neg_pos, desc(coef))

# number of terms to report at each end of the scale
n_print <- 100

# select most positive words (terms tied at the cut-off are all kept)
dat_neg_pos_top <- dat_neg_pos %>%
    slice_max(order_by = coef, n = n_print) %>%
    mutate(type_statement = "Positive")

head(dat_neg_pos_top)


# select most negative words, ordered from the lowest score upwards
# (terms tied at the cut-off are all kept)
dat_neg_pos_bottom <- dat_neg_pos %>%
    slice_min(order_by = coef, n = n_print) %>%
    mutate(type_statement = "Negative") %>%
    arrange(coef)

head(dat_neg_pos_bottom)

# create table with most positive and negative words: a single row with two
# cells, each holding a comma-separated list of terms
# (naming caveat: dat_neg_combine holds the highest-scoring terms and
# dat_pos_combine the lowest-scoring ones)

dat_neg_combine <- summarise(
    dat_neg_pos_top,
    `Highest Scores (Friendly)` = paste(term, collapse = ", ")
)

dat_pos_combine <- summarise(
    dat_neg_pos_bottom,
    `Lowest Scores (Unfriendly)` = paste(term, collapse = ", ")
)

# lowest scores in the first column, highest scores in the second
dat_pos_tab <- bind_cols(dat_pos_combine, dat_neg_combine)


# write the table to an HTML file without row names
dat_pos_tab %>%
    xtable() %>%
    print(
        include.rownames = FALSE,
        type = "html",
        file = "table_words_100.html"
    )

# predict wolfy score for all tweets
# The scores are assigned by position, so this relies on dfmat_2021 having
# one document per row of dat_2021 in the same order (it was built from
# dat_2021 in the tokenisation step above).
# NOTE(review): presumably none of the dfm steps drops documents - confirm
dat_2021$wolfy_score <- predict(tmod_lss, newdata = dfmat_2021)

# number of accounts with scored tweets
length(unique(dat_2021$screen_name))

# count number of tokens after removing punctuation and 
# urls
# (unlike the tokens used for the model, symbols, numbers and stopwords
# are kept here)

ntoks <- dat_2021 %>% 
    corpus() %>% 
    tokens(remove_punct = TRUE,
           remove_url = TRUE) %>% 
    ntoken()

# store the token count per tweet, again matched by position
dat_2021$nwords <- ntoks

# tweet-level analysis data: add the calendar date, keep the variables used
# in the empirical analysis and sort by account type, account and time
dat_2021_analysis <- dat_2021 %>%
    mutate(date = as.Date(created_at)) %>%
    select(
        wolfy_score, date, created_at,
        type, lang, text, nwords, is_retweet,
        type_diplomat, account_created, country,
        screen_name, followers_count,
        retweet_count, favorite_count,
        status_id
    ) %>%
    arrange(type, screen_name, created_at)

nrow(dat_2021_analysis)

# drop tweets for which no wolfy score is available
dat_2021_analysis_valid <- dat_2021_analysis %>%
    filter(!is.na(wolfy_score))

nrow(dat_2021_analysis_valid)


# recode countries to avoid errors in Stata: strip the qualifiers that
# follow the short country name and collapse leftover whitespace
dat_2021_analysis_valid <- dat_2021_analysis_valid %>%
    mutate(
        country = country %>%
            str_remove_all("\\nRepublic of") %>%
            str_remove_all(", Bolivarian") %>%
            str_remove_all(", United") %>%
            str_remove_all(", Islamic Republic of") %>%
            str_remove_all(", the Democratic Republic of the") %>%
            str_squish()
    )

# check the recoded country names
table(dat_2021_analysis_valid$country)


# get most friendly and most wolfy tweets (only tweets with more than
# 6 words); identical score/text pairs are reported once, and tweets tied
# at the cut-off are all kept

# 30 highest scores
dat_sentences_top <- dat_2021_analysis_valid %>%
    filter(nwords > 6) %>%
    arrange(desc(wolfy_score)) %>%
    distinct(wolfy_score, text) %>%
    top_n(n = 30, wt = wolfy_score) %>%
    mutate(type_statement = "Friendly")


# 30 lowest scores
dat_sentences_bottom <- dat_2021_analysis_valid %>%
    filter(nwords > 6) %>%
    arrange(wolfy_score) %>%
    distinct(wolfy_score, text) %>%
    top_n(n = 30, wt = -wolfy_score) %>%
    mutate(type_statement = "Wolfy")



# stack both sets (wolfy tweets first) and round the scores to two digits
dat_sentences_bottom_top <- dat_sentences_bottom %>%
    bind_rows(dat_sentences_top) %>%
    mutate(wolfy_score = round(wolfy_score, 2))

# save output as an HTML table
dat_sentences_bottom_top %>%
    xtable() %>%
    print(type = "html", file = "examples_wolfy_friendly.html")


# prepare the data set for sharing: drop the tweet text and store the
# tweet ID as a factor
dat_2021_analysis_valid_save <- dat_2021_analysis_valid %>%
    select(-text) %>%
    mutate(status_id = factor(status_id))

# save data for subsequent analyses; missing values are written as "."
write_csv(
    dat_2021_analysis_valid_save,
    "data_tweets_en.csv",
    na = "."
)

